#!/usr/local/bin/pythonw
# -*- coding: utf-8 -*-

import glob
import os
import re
import shutil
import string
#import chardet

class Corpus_index:
    """Index of a corpus directory, grouping document numbers under a key.

    ``self.index`` is a dict mapping a key to a list of 1-based numbers:

    * when the index file can be read, each key is the domain captured from
      an ``http://`` URL and each number is the line on which that URL
      appears in the index file;
    * otherwise there is a single key, the corpus directory as given, whose
      list holds 1..N where N is the count of ``*.html`` files found in it.
    """

    # Captures everything between "http://" and the next "/" (or the end of
    # the line).  Case-insensitive, so "HTTP://" is matched as well.
    _URL_DOMAIN = re.compile(r'http://([^/]*)', re.I)

    def __init__(self, _dir, _file):
        """Build the index from the file ``_dir + _file``.

        _dir  -- corpus directory.  It is concatenated with ``_file`` as-is
                 (no separator is inserted), so one of the two must carry
                 the path separator.
        _file -- name of the index file, read as text.

        If the index file cannot be read, the index falls back to a plain
        enumeration of the ``*.html`` files of ``_dir`` (see dir2index).
        """
        # Remembered so that move_corpus() knows where the documents live.
        self.dir_corpus = _dir
        path = _dir + _file
        try:
            with open(path) as handle:
                content = handle.read()
        except (IOError, OSError, UnicodeError):
            # Missing, unreadable or undecodable index file: best-effort
            # fallback on the directory listing.
            self.index = self.dir2index(_dir)
        else:
            # Parsing happens outside the try so that a bug in it is not
            # silently turned into the directory fallback.
            self.index = self.content2index(content)

    def content2index(self, _content):
        """Return ``{domain: [line numbers]}`` for the URLs in ``_content``.

        ``_content`` is split on newlines; only the first ``http://`` URL of
        each line is considered.  Line numbers are 1-based and appear in
        increasing order.  Lines without a URL are skipped.
        """
        index = {}
        for number, line in enumerate(_content.split('\n'), 1):
            match = self._URL_DOMAIN.search(line)
            if match:
                index.setdefault(match.group(1), []).append(number)
        return index

    def dir2index(self, _dir):
        """Return ``{_dir: [1, ..., N]}``, N being the ``*.html`` file count.

        Only the number of matching files is used; their names are not
        inspected.
        """
        count = len(glob.glob(_dir + "/*.html"))
        return {_dir: list(range(1, count + 1))}

    def move_corpus(self, dir_tgt, dir_corpus=None):
        """Copy each indexed document into a per-key sub-directory.

        For every key of ``self.index`` a directory ``dir_tgt/<key>`` is
        created if needed, then each ``fichier<N>.html`` listed under that
        key is copied into it.  Despite the method name, source files are
        copied, not removed.

        dir_tgt    -- root of the target tree.
        dir_corpus -- directory holding the ``fichier<N>.html`` documents;
                      defaults to the directory given to the constructor.

        A file that cannot be copied is reported on standard output and
        skipped; the remaining files are still processed.
        """
        if dir_corpus is None:
            dir_corpus = self.dir_corpus

        for domain, numbers in self.index.items():
            dir_domain = os.path.join(dir_tgt, domain)
            if not os.path.isdir(dir_domain):
                os.makedirs(dir_domain)

            for number in numbers:
                filename = 'fichier%i.html' % number
                filepath = os.path.join(dir_corpus, filename)
                newfilepath = os.path.join(dir_domain, filename)
                try:
                    shutil.copyfile(filepath, newfilepath)
                except (IOError, OSError, shutil.Error):
                    # Single-argument form prints identically on Python 2
                    # and Python 3.
                    print("error : %s" % filepath)


